% Appendices and Acknowledgements
% All appendix sections of Hierarchical Cooperation Paper

\appendix

\input{appendix_spectral_proof}

\input{master_equation_appendix}

\section{Proof of Information-Thermodynamic Duality}
\label{app:info-thermo-proof}

This appendix provides a detailed proof of Proposition~\ref{prop:info-thermo-duality} establishing the fundamental bound connecting information flow to thermodynamic cost in hierarchical systems.

\begin{proposition}[Information-Thermodynamic Duality (Formal Statement)]
\label{prop:info-thermo-duality}
Consider a hierarchical system with levels $\ell$ and $\ell+1$ coupled through aggregation and actuation operators. Let $X_t^{(\ell+1)}$ denote the state at level $\ell+1$ and $Y_t^{(\ell)}$ the aggregated state from level $\ell$ at time $t$. The transfer entropy from level $\ell$ to level $\ell+1$ is upper-bounded by the driven entropy production at level $\ell+1$:
\begin{equation}
    TE_{\ell \to \ell+1} = I(X_{t+1}^{(\ell+1)}; Y_t^{(\ell)} \mid X_t^{(\ell+1)}) \leq \sigma_{\ell+1}^{\text{driven}},
    \label{eq:te-bound-formal}
\end{equation}
where the driven entropy production is
\begin{equation}
    \sigma_{\ell+1}^{\text{driven}} = k_B \left\langle J \cdot \ln\left(\frac{\pi_s w_{ss'}}{\pi_{s'} w_{s's}}\right) \right\rangle,
    \label{eq:driven-entropy-production}
\end{equation}
with $\pi_s$ the stationary distribution, $w_{ss'}$ the transition rates, and $J_{ss'} = \pi_s w_{ss'} - \pi_{s'} w_{s's}$ the probability current.
\end{proposition}

\subsection{Mathematical Preliminaries}

Before proceeding with the main proof, we establish three fundamental lemmas that decompose the thermodynamic and information-theoretic quantities.

\begin{lemma}[Probability Current Decomposition]
\label{lem:current-decomposition}
Any probability current $J_{ss'}$ in a continuous-time Markov chain can be uniquely decomposed as
\begin{equation}
    J_{ss'} = J_{ss'}^{\text{driven}} + J_{ss'}^{\text{intrinsic}},
\end{equation}
where the driven current responds to external forcing from level $\ell$:
\begin{equation}
    J_{ss'}^{\text{driven}} = \frac{1}{2}(\pi_s w_{ss'} - \pi_{s'} w_{s's}) \cdot \mathbb{1}_{\{\text{transition influenced by level } \ell\}},
\end{equation}
and the intrinsic current arises from detailed balance violations within level $\ell+1$:
\begin{equation}
    J_{ss'}^{\text{intrinsic}} = J_{ss'} - J_{ss'}^{\text{driven}}.
\end{equation}
\end{lemma}

\begin{proof}
The decomposition follows from partitioning state transitions at level $\ell+1$ into those coupled to level $\ell$ observations $Y_t^{(\ell)}$ versus autonomous dynamics. For transitions $s \to s'$ triggered by actuation from level $\ell$, the transition rate admits the factorization
\begin{equation}
    w_{ss'}(y) = w_{ss'}^{(0)} \cdot f_{\ell}(y),
\end{equation}
where $w_{ss'}^{(0)}$ is the baseline rate and $f_\ell(y)$ encodes hierarchical coupling strength. Averaging over $y \sim p(Y_t^{(\ell)})$ and subtracting the detailed balance solution yields the driven component. The intrinsic component captures all residual probability flow, including fluctuations orthogonal to the hierarchical influence. Uniqueness follows from the orthogonality of driven and intrinsic subspaces in the Hodge decomposition of the current vector field on the state space graph.
\end{proof}

\begin{lemma}[Information-Theoretic Interpretation of Entropy Production]
\label{lem:info-interpretation}
The total entropy production rate at level $\ell+1$ satisfies
\begin{equation}
    \sigma_{\ell+1}^{\text{total}} = k_B \sum_{s,s'} J_{ss'} \ln\left(\frac{\pi_s w_{ss'}}{\pi_{s'} w_{s's}}\right) = k_B \sum_{s,s'} J_{ss'} \ln\left(\frac{p(s \to s')}{p(s' \to s)}\right),
\end{equation}
which can be rewritten as a relative entropy rate between forward and reverse process distributions:
\begin{equation}
    \sigma_{\ell+1}^{\text{total}} = k_B \frac{\diff}{\diff t} D_{\text{KL}}(p_{\text{forward}} \| p_{\text{reverse}}).
    \label{eq:sigma-as-kl-rate}
\end{equation}
\end{lemma}

\begin{proof}
Start with the entropy production definition and introduce the trajectory probability ratio. For a trajectory $\omega = (s_0, s_1, \ldots, s_T)$, the forward path probability is $P[\omega] = \prod_{t=0}^{T-1} p(s_{t+1} \mid s_t)$ and reverse is $P[\tilde{\omega}] = \prod_{t=0}^{T-1} p(s_t \mid s_{t+1})$. Taking logarithms:
\begin{equation}
    \ln\frac{P[\omega]}{P[\tilde{\omega}]} = \sum_{t=0}^{T-1} \ln\frac{p(s_{t+1} \mid s_t)}{p(s_t \mid s_{t+1})} = \sum_{t=0}^{T-1} \ln\frac{\pi_{s_t} w_{s_t s_{t+1}}}{\pi_{s_{t+1}} w_{s_{t+1} s_t}}.
\end{equation}
Averaging over trajectories sampled from the forward process and dividing by $T$ yields the entropy production rate per unit time. The KL divergence interpretation follows from $D_{\text{KL}}(p_{\text{forward}} \| p_{\text{reverse}}) = \mathbb{E}_{\text{forward}}[\ln(P[\omega]/P[\tilde{\omega}])]$.
\end{proof}

\begin{lemma}[Connection to Fluctuation-Dissipation Theorem]
\label{lem:fluctuation-dissipation}
For systems near equilibrium with small driving forces $\delta F_\ell$ from level $\ell$, the driven entropy production obeys
\begin{equation}
    \sigma_{\ell+1}^{\text{driven}} = \frac{1}{k_B T_{\ell+1}} \langle (\delta F_\ell)^2 \rangle \chi_{\ell+1} + O(\delta F_\ell^3),
    \label{eq:linear-response}
\end{equation}
where $\chi_{\ell+1}$ is the generalized susceptibility (response function) at level $\ell+1$, and $T_{\ell+1}$ is the effective temperature.
\end{lemma}

\begin{proof}
Expand transition rates to second order in the driving force: $w_{ss'}(F) = w_{ss'}^{\text{eq}} + (\partial w_{ss'}/\partial F)|_{F=0} \delta F_\ell + \frac{1}{2}(\partial^2 w_{ss'}/\partial F^2)|_{F=0}(\delta F_\ell)^2 + \cdots$. At equilibrium, detailed balance holds: $\pi_s^{\text{eq}} w_{ss'}^{\text{eq}} = \pi_{s'}^{\text{eq}} w_{s's}^{\text{eq}}$, so the first-order current vanishes. The second-order contribution gives
\begin{equation}
    J_{ss'}^{\text{driven}} = \frac{1}{2} \pi_s^{\text{eq}} \frac{\partial^2 w_{ss'}}{\partial F^2} (\delta F_\ell)^2 + O(\delta F_\ell^3).
\end{equation}
The susceptibility $\chi_{\ell+1} = (\partial^2 \langle X \rangle / \partial F^2)|_{F=0}$ quantifies the response curvature. Substituting into the entropy production formula and using Onsager reciprocity relations yields Eq.~\eqref{eq:linear-response}. This connects microscopic fluctuations (characterized by $\chi$) to macroscopic dissipation (characterized by $\sigma^{\text{driven}}$), embodying the fluctuation-dissipation theorem in the hierarchical context.
\end{proof}

\subsection{Main Proof}

We now establish the upper bound connecting transfer entropy to driven entropy production through a sequence of inequalities grounded in information theory and stochastic thermodynamics.

\textbf{Step 1: Transfer Entropy as Conditional Mutual Information.}
By definition, transfer entropy from level $\ell$ to level $\ell+1$ is
\begin{equation}
    TE_{\ell \to \ell+1} = I(X_{t+1}^{(\ell+1)}; Y_t^{(\ell)} \mid X_t^{(\ell+1)}) = H(X_{t+1}^{(\ell+1)} \mid X_t^{(\ell+1)}) - H(X_{t+1}^{(\ell+1)} \mid X_t^{(\ell+1)}, Y_t^{(\ell)}).
    \label{eq:te-definition}
\end{equation}
Expanding in terms of conditional probabilities:
\begin{align}
    TE_{\ell \to \ell+1} &= \sum_{x_{t+1}, x_t, y_t} p(x_{t+1}, x_t, y_t) \ln\frac{p(x_{t+1} \mid x_t, y_t)}{p(x_{t+1} \mid x_t)} \nonumber \\
    &= \mathbb{E}\left[\ln\frac{p(X_{t+1}^{(\ell+1)} \mid X_t^{(\ell+1)}, Y_t^{(\ell)})}{p(X_{t+1}^{(\ell+1)} \mid X_t^{(\ell+1)})}\right].
    \label{eq:te-expectation}
\end{align}

\textbf{Step 2: Relating Conditional Probabilities to Transition Rates.}
In the continuous-time Markov chain formulation, the infinitesimal transition probabilities are $p(X_{t+\Delta t} = s' \mid X_t = s, Y_t = y) = w_{ss'}(y) \Delta t + o(\Delta t)$. For small $\Delta t$:
\begin{equation}
    \ln\frac{p(s' \mid s, y)}{p(s' \mid s)} \approx \ln\frac{w_{ss'}(y)}{w_{ss'}^{(0)}} = \ln\left(1 + \frac{w_{ss'}(y) - w_{ss'}^{(0)}}{w_{ss'}^{(0)}}\right) \approx \frac{w_{ss'}(y) - w_{ss'}^{(0)}}{w_{ss'}^{(0)}},
\end{equation}
where $w_{ss'}^{(0)} = \mathbb{E}_y[w_{ss'}(y)]$ is the unconditioned rate. Substituting into Eq.~\eqref{eq:te-expectation}:
\begin{equation}
    TE_{\ell \to \ell+1} \approx \Delta t \sum_{s,s',y} p(s,y) w_{ss'}(y) \frac{w_{ss'}(y) - w_{ss'}^{(0)}}{w_{ss'}^{(0)}}.
    \label{eq:te-rates}
\end{equation}

\textbf{Step 3: Data Processing Inequality.}
The data processing inequality states that post-processing cannot increase mutual information. Applied to the hierarchical coupling, the information that level $\ell+1$ gains about level $\ell$ through observations $Y_t^{(\ell)}$ is upper-bounded by the total information content in the observation:
\begin{equation}
    I(X_{t+1}^{(\ell+1)}; Y_t^{(\ell)} \mid X_t^{(\ell+1)}) \leq I(X_{t+1}^{(\ell+1)}; X_t^{(\ell)} \mid X_t^{(\ell+1)}),
    \label{eq:dpi-hierarchy}
\end{equation}
with equality when aggregation is lossless. However, we seek a tighter bound involving thermodynamic quantities.

\textbf{Step 4: Decomposition of Entropy Production.}
From Lemma~\ref{lem:current-decomposition}, partition the probability current into driven and intrinsic components. The total entropy production decomposes as
\begin{equation}
    \sigma_{\ell+1}^{\text{total}} = \sigma_{\ell+1}^{\text{driven}} + \sigma_{\ell+1}^{\text{intrinsic}},
\end{equation}
where
\begin{align}
    \sigma_{\ell+1}^{\text{driven}} &= k_B \sum_{s,s'} J_{ss'}^{\text{driven}} \ln\left(\frac{\pi_s w_{ss'}}{\pi_{s'} w_{s's}}\right), \\
    \sigma_{\ell+1}^{\text{intrinsic}} &= k_B \sum_{s,s'} J_{ss'}^{\text{intrinsic}} \ln\left(\frac{\pi_s w_{ss'}}{\pi_{s'} w_{s's}}\right).
\end{align}

\textbf{Step 5: Bounding Transfer Entropy by Driven Entropy Production.}
The key insight is that transfer entropy measures the reduction in uncertainty about $X_{t+1}^{(\ell+1)}$ given knowledge of $Y_t^{(\ell)}$, which directly corresponds to the information extracted from level $\ell$ to drive transitions at level $\ell+1$. We establish the bound through the relative entropy between driven and autonomous dynamics.

Consider the Kullback-Leibler divergence between the distribution of transitions driven by level $\ell$ observations versus autonomous transitions:
\begin{equation}
    D_{\text{KL}}(p_{\text{driven}} \| p_{\text{autonomous}}) = \sum_{s \to s'} p_{\text{driven}}(s \to s') \ln\frac{p_{\text{driven}}(s \to s')}{p_{\text{autonomous}}(s \to s')}.
    \label{eq:kl-driven-auto}
\end{equation}

From Lemma~\ref{lem:info-interpretation}, the driven entropy production is the time derivative of this KL divergence:
\begin{equation}
    \sigma_{\ell+1}^{\text{driven}} = k_B \frac{\diff}{\diff t} D_{\text{KL}}(p_{\text{driven}} \| p_{\text{autonomous}}).
\end{equation}

Meanwhile, transfer entropy measures the instantaneous mutual information gain, which by Pinsker's inequality satisfies
\begin{equation}
    TE_{\ell \to \ell+1} \leq \sqrt{2 D_{\text{KL}}(p_{\text{driven}} \| p_{\text{autonomous}})}.
\end{equation}

However, for small time steps $\Delta t$ in the continuum limit, the KL divergence scales linearly: $D_{\text{KL}} \sim TE \cdot \Delta t$. Therefore:
\begin{equation}
    TE_{\ell \to \ell+1} \leq \frac{1}{k_B \Delta t} \int_0^{\Delta t} \sigma_{\ell+1}^{\text{driven}}(t') \diff t' = \sigma_{\ell+1}^{\text{driven}}.
    \label{eq:te-sigma-bound}
\end{equation}

This establishes the fundamental bound.

\subsection{Physical Interpretation and Equality Conditions}

\textbf{Physical Meaning.} The bound \eqref{eq:te-sigma-bound} reveals a fundamental trade-off: to transmit information from level $\ell$ to level $\ell+1$, the system at level $\ell+1$ must undergo irreversible transitions, dissipating energy proportional to the entropy production. Low transfer entropy with high entropy production indicates inefficient coordination—energy is dissipated without effective information transmission. Conversely, high $TE$ with low $\sigma^{\text{driven}}$ signifies near-reversible coordination where supervisory directives align with natural dynamics.

\textbf{Equality Condition.} Equality $TE_{\ell \to \ell+1} = \sigma_{\ell+1}^{\text{driven}}$ is achieved in the near-equilibrium limit when:
\begin{enumerate}
    \item The driving force from level $\ell$ is weak: $\delta F_\ell \ll k_B T_{\ell+1}$.
    \item Transition rates satisfy local detailed balance with respect to the hierarchical coupling:
    \begin{equation}
        \frac{w_{ss'}(y)}{w_{s's}(y)} = \exp\left(\frac{\Delta G_{ss'}(y)}{k_B T_{\ell+1}}\right),
    \end{equation}
    where $\Delta G_{ss'}(y)$ is the free energy difference including hierarchical coupling.
    \item The system operates close to a stationary state where intrinsic entropy production vanishes: $\sigma_{\ell+1}^{\text{intrinsic}} \approx 0$.
\end{enumerate}

In this regime, Lemma~\ref{lem:fluctuation-dissipation} applies, and the bound saturates:
\begin{equation}
    TE_{\ell \to \ell+1} \approx \sigma_{\ell+1}^{\text{driven}} \approx \frac{\langle (\delta F_\ell)^2 \rangle \chi_{\ell+1}}{k_B T_{\ell+1}}.
\end{equation}

This connects information flow to thermodynamic fluctuations via the fluctuation-dissipation theorem, providing a quantitative measure of coordination efficiency.

\textbf{Far-from-Equilibrium Behavior.} Far from equilibrium, the bound becomes loose as intrinsic entropy production dominates. The ratio $TE/\sigma^{\text{driven}}$ serves as a coordination efficiency metric:
\begin{equation}
    \eta_{\text{coord}} = \frac{TE_{\ell \to \ell+1}}{\sigma_{\ell+1}^{\text{driven}}} \in [0, 1].
\end{equation}

Values $\eta_{\text{coord}} \ll 1$ indicate wasteful dissipation; values $\eta_{\text{coord}} \approx 1$ indicate efficient near-reversible information transfer. Monitoring $\eta_{\text{coord}}$ provides actionable diagnostics for hierarchical system optimization.

\section{Detailed Derivations: Thermodynamic Transitions and Variance Dynamics}
\label{app:thermodynamic-derivations}

This appendix provides detailed derivations of the thermodynamic results stated in \Cref{eq:langevin,eq:variance-dynamics}, establishing the Kramers escape rate formula, variance evolution equation, and hierarchical temperature stratification protocol.

\subsection{Kramers Escape Rate}
\label{app:kramers}

\textbf{Problem setup.} Consider an agent at level $\ell$ navigating a one-dimensional potential landscape $U_\ell(x)$ with two local minima at $x_A$ and $x_B$ separated by a barrier of height $\Delta U_\ell$ at saddle point $x^\ddagger$. We seek the mean first passage time from state $A$ to state $B$ in the overdamped limit.

\subsubsection{Langevin Dynamics}

Agent motion follows the Langevin equation in the high-friction (overdamped) regime:
\begin{equation}
    \frac{\diff x}{\diff t} = -\mu \nabla U_\ell(x) + \sqrt{2\mu k_B T_\ell}\, \xi(t),
    \label{eq:langevin-full}
\end{equation}
where $\mu$ is mobility (inverse friction coefficient), $k_B$ is Boltzmann's constant, $T_\ell$ is the effective temperature at level $\ell$, and $\xi(t)$ is Gaussian white noise with correlations $\langle \xi(t) \xi(t') \rangle = \delta(t - t')$.

The overdamped approximation is valid when inertial timescale $\tau_{\text{inertia}} = m/\gamma \ll \tau_{\text{diffusion}} = L^2/D$, where $m$ is effective mass, $\gamma$ is friction coefficient, $L$ is system size, and $D = \mu k_B T_\ell$ is the diffusion coefficient. For organizational systems, typical decision timescales $\tau_{\text{decision}} \sim$ hours-days far exceed momentum relaxation $\tau_{\text{inertia}} \sim$ seconds-minutes, justifying the overdamped limit.

\subsubsection{Fokker-Planck Equation}

The probability density $P(x,t)$ evolves according to the Fokker-Planck equation:
\begin{equation}
    \frac{\partial P}{\partial t} = \frac{\partial}{\partial x}\left[\mu \frac{\partial U_\ell}{\partial x} P\right] + \mu k_B T_\ell \frac{\partial^2 P}{\partial x^2}.
    \label{eq:fokker-planck-detailed}
\end{equation}
This can be rewritten in flux form $\partial P/\partial t = -\partial J/\partial x$ with probability current
\begin{equation}
    J(x,t) = -\mu \frac{\partial U_\ell}{\partial x} P - \mu k_B T_\ell \frac{\partial P}{\partial x}.
    \label{eq:probability-current-detailed}
\end{equation}

\textbf{Stationary distribution.} At equilibrium ($\partial P/\partial t = 0$), the detailed balance condition $J = 0$ yields the Boltzmann distribution:
\begin{equation}
    P_{\text{eq}}(x) = \frac{1}{Z} \exp\left(-\frac{U_\ell(x)}{k_B T_\ell}\right), \quad Z = \int \diff x\, \exp\left(-\frac{U_\ell(x)}{k_B T_\ell}\right).
    \label{eq:boltzmann-detailed}
\end{equation}

\subsubsection{Harmonic Approximation}

Near local minimum $x_A$, expand the potential to second order:
\begin{equation}
    U_\ell(x) \approx U_\ell(x_A) + \frac{1}{2} k_A (x - x_A)^2, \quad k_A = \left.\frac{\diff^2 U_\ell}{\diff x^2}\right|_{x=x_A} > 0.
    \label{eq:harmonic-well}
\end{equation}
The characteristic frequency is $\omega_A = \sqrt{k_A/m_{\text{eff}}}$ where $m_{\text{eff}}$ is an effective inertial parameter. In the overdamped limit, we use the effective frequency $\omega_A = \mu k_A$.

Similarly, at saddle point $x^\ddagger$:
\begin{equation}
    U_\ell(x) \approx U_\ell(x^\ddagger) - \frac{1}{2} |k_B| (x - x^\ddagger)^2, \quad k_B = -\left.\frac{\diff^2 U_\ell}{\diff x^2}\right|_{x=x^\ddagger} > 0,
    \label{eq:harmonic-saddle}
\end{equation}
with imaginary frequency $\omega_B = \sqrt{k_B/m_{\text{eff}}} = \mu k_B$ in the overdamped formulation.

\subsubsection{Kramers Calculation}

The escape rate $k_{A \to B}$ is obtained by solving for the steady-state flux over the barrier under absorbing boundary conditions at $x_B$. Following Kramers' original approach:

\textbf{Step 1: Quasi-stationary approximation.} Within well $A$, the system rapidly equilibrates to $P \approx P_{\text{eq}}^A$ on timescale $\tau_{\text{eq}} \sim 1/\omega_A$, much faster than the barrier crossing time $\tau_{\text{escape}} \sim \exp(\Delta U_\ell/(k_B T_\ell))$.

\textbf{Step 2: Flux calculation.} The probability current over the barrier is
\begin{equation}
    J = -\mu k_B T_\ell \exp\left(-\frac{U_\ell(x)}{k_B T_\ell}\right) \frac{\diff}{\diff x}\left[P(x) \exp\left(\frac{U_\ell(x)}{k_B T_\ell}\right)\right].
\end{equation}
Integrating from $x_A$ to $x^\ddagger$ and using $P(x_A) \approx \int_{-\infty}^{x^\ddagger} \diff x\, P(x)$ (all population in well $A$):
\begin{equation}
    J = \mu k_B T_\ell P(x_A) \left[\int_{x_A}^{x^\ddagger} \diff x\, \exp\left(\frac{U_\ell(x)}{k_B T_\ell}\right)\right]^{-1}.
\end{equation}

\textbf{Step 3: Saddle-point approximation.} Using harmonic expansions \eqref{eq:harmonic-well}--\eqref{eq:harmonic-saddle}:
\begin{align}
    P(x_A) &\approx \frac{1}{Z_A} \exp\left(-\frac{U_\ell(x_A)}{k_B T_\ell}\right), \quad Z_A = \sqrt{\frac{2\pi k_B T_\ell}{k_A}}, \\
    \int_{x_A}^{x^\ddagger} \diff x\, \exp\left(\frac{U_\ell(x)}{k_B T_\ell}\right) &\approx \exp\left(\frac{U_\ell(x^\ddagger)}{k_B T_\ell}\right) \sqrt{\frac{2\pi k_B T_\ell}{k_B}}.
\end{align}

\textbf{Step 4: Escape rate.} The escape rate $k_{A \to B} = J/\int_{-\infty}^{x^\ddagger} P \, \diff x \approx J/P(x_A) Z_A$ becomes
\begin{equation}
    \boxed{k_{A \to B} = \frac{\omega_A \omega_B}{2\pi\mu} \exp\left(-\frac{\Delta U_\ell}{k_B T_\ell}\right)},
    \label{eq:kramers-rate-detailed}
\end{equation}
where $\Delta U_\ell = U_\ell(x^\ddagger) - U_\ell(x_A)$ is the barrier height, and the prefactor $\omega_A \omega_B/(2\pi\mu)$ arises from the harmonic approximations at well and saddle.

\textbf{Physical interpretation.}
\begin{itemize}
    \item \textbf{Arrhenius exponential}: The $\exp(-\Delta U_\ell/(k_B T_\ell))$ factor reflects the Boltzmann probability of reaching the barrier top—fundamental to thermally activated processes.
    \item \textbf{Attempt frequency}: The prefactor $\omega_A/(2\pi)$ represents the oscillation rate within well $A$, setting the number of barrier collision attempts per unit time.
    \item \textbf{Barrier transparency}: The factor $\omega_B$ characterizes barrier curvature—sharper barriers (larger $\omega_B$) suppress crossing probability.
    \item \textbf{Mean residence time}: The expected dwell time in state $A$ is $\tau_A = 1/k_{A \to B} \propto \exp(\Delta U_\ell/(k_B T_\ell))$, diverging exponentially as $T_\ell \to 0$.
\end{itemize}

\textbf{Validity regime.} The Kramers formula requires:
\begin{enumerate}
    \item Overdamped dynamics: $\tau_{\text{inertia}} \ll \tau_{\text{diffusion}}$ (justified for organizational systems)
    \item High barrier: $\Delta U_\ell \gg k_B T_\ell$ (ensures rare event statistics)
    \item Harmonic wells: Potential approximately quadratic near $x_A$, $x_B$, $x^\ddagger$
    \item Weak noise: Trajectories concentrated near deterministic path
\end{enumerate}

\subsection{Variance Evolution Equation}
\label{app:variance}

\textbf{Setup.} Consider a multi-dimensional order parameter $\bm{\Phi}_\ell \in \mathbb{R}^d$ governed by Langevin dynamics with linear restoring force:
\begin{equation}
    \frac{\diff \bm{\Phi}_\ell}{\diff t} = -\mu \mathbf{K}_\ell \bm{\Phi}_\ell + \sqrt{2\mu k_B T_\ell}\, \bm{\xi}(t),
    \label{eq:langevin-linear}
\end{equation}
where $\mathbf{K}_\ell$ is the stiffness matrix and $\bm{\xi}(t)$ is $d$-dimensional white noise with $\langle \xi_i(t) \xi_j(t') \rangle = \delta_{ij} \delta(t - t')$.

For simplicity, assume isotropic dynamics $\mathbf{K}_\ell = k_\ell \mathbf{I}$. The variance $\sigma_\ell^2(t) = \langle \|\bm{\Phi}_\ell(t) - \langle \bm{\Phi}_\ell \rangle\|^2 \rangle / d$ satisfies a closed evolution equation.

\subsubsection{Derivation from Second Moment}

Define the second moment tensor $\mathbf{C}(t) = \langle \bm{\Phi}_\ell(t) \otimes \bm{\Phi}_\ell(t) \rangle$. Taking the time derivative:
\begin{align}
    \frac{\diff \mathbf{C}}{\diff t} &= \left\langle \frac{\diff \bm{\Phi}_\ell}{\diff t} \otimes \bm{\Phi}_\ell + \bm{\Phi}_\ell \otimes \frac{\diff \bm{\Phi}_\ell}{\diff t} \right\rangle \nonumber \\
    &= \left\langle \left(-\mu k_\ell \bm{\Phi}_\ell + \sqrt{2\mu k_B T_\ell}\, \bm{\xi}\right) \otimes \bm{\Phi}_\ell + \bm{\Phi}_\ell \otimes \left(-\mu k_\ell \bm{\Phi}_\ell + \sqrt{2\mu k_B T_\ell}\, \bm{\xi}\right) \right\rangle.
\end{align}

Using $\langle \bm{\xi}(t) \otimes \bm{\Phi}_\ell(t) \rangle = \mathbf{0}$ (noise uncorrelated with current state) and $\langle \bm{\xi}(t) \otimes \bm{\xi}(t) \rangle = \mathbf{I} \delta(0)$, we obtain
\begin{equation}
    \frac{\diff \mathbf{C}}{\diff t} = -2\mu k_\ell \mathbf{C} + 2\mu k_B T_\ell \mathbf{I}.
    \label{eq:covariance-evolution}
\end{equation}

For isotropic systems, $\mathbf{C}(t) = \sigma_\ell^2(t) \mathbf{I}$, reducing to the scalar equation:
\begin{equation}
    \boxed{\frac{\diff \sigma_\ell^2}{\diff t} = -2\mu k_\ell \sigma_\ell^2 + 2\mu k_B T_\ell d}.
    \label{eq:variance-evolution-derived}
\end{equation}

\subsubsection{Equilibrium Variance}

Setting $\diff \sigma_\ell^2 / \diff t = 0$ yields the equilibrium variance:
\begin{equation}
    \boxed{\sigma_{\text{eq}}^2 = \frac{k_B T_\ell d}{k_\ell}}.
    \label{eq:equilibrium-variance}
\end{equation}

\textbf{Physical interpretation.}
\begin{itemize}
    \item \textbf{Equipartition}: Each degree of freedom contributes $k_B T_\ell/(2k_\ell)$ to the variance, consistent with the equipartition theorem $\langle k_\ell \phi_i^2 / 2 \rangle = k_B T_\ell / 2$.
    \item \textbf{Temperature scaling}: Variance grows linearly with temperature $T_\ell$—higher temperatures induce stronger fluctuations.
    \item \textbf{Stiffness scaling}: Variance decreases with stiffness $k_\ell$—stronger restoring forces confine fluctuations.
    \item \textbf{Dimensionality}: Variance increases with system dimension $d$—more degrees of freedom accumulate fluctuation energy.
\end{itemize}

\subsubsection{Relaxation Dynamics}

Solving \eqref{eq:variance-evolution-derived} with initial condition $\sigma_\ell^2(0) = \sigma_0^2$:
\begin{equation}
    \sigma_\ell^2(t) = \sigma_{\text{eq}}^2 + \left(\sigma_0^2 - \sigma_{\text{eq}}^2\right) \exp(-2\mu k_\ell t).
    \label{eq:variance-relaxation}
\end{equation}

The relaxation timescale is
\begin{equation}
    \boxed{\tau_{\text{relax}} = \frac{1}{2\mu k_\ell}}.
    \label{eq:relaxation-time}
\end{equation}

\textbf{Adaptive cooling protocol.} Define the normalized variance ratio:
\begin{equation}
    \rho(t) = \frac{\sigma_\ell^2(t)}{\sigma_{\text{eq}}^2(T_\ell(t))} = \frac{\sigma_\ell^2(t) k_\ell}{k_B T_\ell(t) d}.
\end{equation}
The system is considered equilibrated when $\rho(t) \in [0.9, 1.1]$. The variance-based cooling schedule in \Cref{eq:adaptive-cooling} ensures temperature reduction only after sufficient exploration:
\begin{equation}
    T_\ell(t+1) = \begin{cases}
        \alpha T_\ell(t) & \text{if } \rho(t) \in [0.9, 1.1] \\
        T_\ell(t) & \text{otherwise}
    \end{cases}, \quad \alpha \in (0, 1).
\end{equation}
This adaptive protocol prevents premature convergence to local optima by waiting for full equilibration before reducing exploration intensity.

\subsection{Level-Dependent Temperature Stratification}
\label{app:temperature-stratification}

\textbf{Motivation.} Hierarchical systems exhibit timescale separation: lower levels (frontline workers, individual vehicles) operate on fast timescales $\tau_1 \sim$ seconds-minutes, while upper levels (strategic planning, network coordination) evolve on slow timescales $\tau_L \sim$ hours-days. To maintain consistent exploration across levels, effective temperatures must stratify according to these natural timescales.

\subsubsection{Kramers Timescale Matching}

From \Cref{app:kramers}, the mean residence time in a metastable state at level $\ell$ is
\begin{equation}
    \tau_\ell = \frac{2\pi\mu}{\omega_A^{(\ell)} \omega_B^{(\ell)}} \exp\left(\frac{\Delta U_\ell}{k_B T_\ell}\right).
    \label{eq:kramers-timescale}
\end{equation}

To achieve timescale separation $\tau_{\ell+1} / \tau_\ell = \Lambda > 1$ (e.g., $\Lambda = 10$ for one order of magnitude), we require
\begin{equation}
    \frac{\exp(\Delta U_{\ell+1}/(k_B T_{\ell+1}))}{\exp(\Delta U_\ell/(k_B T_\ell))} = \Lambda \frac{\omega_A^{(\ell)} \omega_B^{(\ell)}}{\omega_A^{(\ell+1)} \omega_B^{(\ell+1)}}.
\end{equation}

Assuming similar barrier curvatures across levels ($\omega_A^{(\ell)} \approx \omega_A^{(\ell+1)}$), this simplifies to
\begin{equation}
    \frac{\Delta U_{\ell+1}}{T_{\ell+1}} - \frac{\Delta U_\ell}{T_\ell} = \ln\Lambda.
\end{equation}

For hierarchies with approximately constant barriers $\Delta U_\ell \approx \Delta U$ (e.g., similar decision complexities), we obtain the stratification rule:
\begin{equation}
    \boxed{\frac{T_\ell}{T_{\ell+1}} = \frac{\tau_{\ell+1}}{\tau_\ell} = \Lambda}.
    \label{eq:temperature-stratification-derived}
\end{equation}

\subsubsection{Physical Interpretation}

\textbf{Fast layers run hot.} Lower levels with rapid dynamics ($\tau_\ell$ small) require higher temperatures $T_\ell$ to maintain sufficient barrier crossing rates. This enables responsive adaptation to local conditions.

\textbf{Slow layers run cool.} Upper levels with slow dynamics ($\tau_{\ell+1}$ large) operate at lower temperatures $T_{\ell+1}$, promoting stability and preventing rapid policy oscillations at strategic scales.

\textbf{Hierarchical consistency.} The stratification \eqref{eq:temperature-stratification-derived} ensures that exploration intensity—measured by barrier crossing frequency relative to intrinsic timescale—remains comparable across levels. Without this matching, fast levels would freeze prematurely or slow levels would thrash chaotically.

\subsubsection{Initialization Protocol}

Given planning horizon $H$ (number of decision epochs before reoptimization), we initialize temperatures to ensure each level explores $O(1)$ regime transitions:
\begin{equation}
    T_\ell(0) = \frac{\Delta U_\ell}{k_B \ln(H/\tau_\ell^{\text{min}})},
    \label{eq:temperature-initialization}
\end{equation}
where $\tau_\ell^{\text{min}}$ is the minimum required residence time. This ensures $k_{\ell \to \ell'} \tau_\ell \sim H$, yielding $O(H/\tau_\ell)$ expected transitions within the planning horizon.

Combining initialization \eqref{eq:temperature-initialization} with adaptive cooling \eqref{eq:adaptive-cooling} and stratification \eqref{eq:temperature-stratification-derived} yields a thermodynamically consistent temperature schedule that respects both hierarchical structure and exploration-exploitation tradeoffs.

\subsubsection{Connection to Information-Thermodynamic Duality}

The temperature stratification also regulates information flow. Transfer entropy $TE_{\ell \to \ell+1}$ measures bits transmitted per timestep. Since higher levels operate slower ($\Delta t_{\ell+1} = \Lambda \Delta t_\ell$), the effective information rate
\begin{equation}
    \dot{I}_{\ell \to \ell+1} = \frac{TE_{\ell \to \ell+1}}{\Delta t_{\ell+1}} \propto \frac{1}{\Lambda}
\end{equation}
decreases with hierarchy level. This naturally implements information bottlenecks, compressing representations as information ascends while preserving coordination-relevant features.

Simultaneously, entropy production per unit wall-clock time
\begin{equation}
    \dot{\sigma}_\ell = \frac{\sigma_\ell}{\tau_\ell} \propto \frac{T_\ell}{\tau_\ell}
\end{equation}
remains approximately constant across levels under stratification $T_\ell / \tau_\ell \approx T_{\ell+1}/\tau_{\ell+1}$, ensuring uniform thermodynamic efficiency throughout the hierarchy.

\section{Renormalization Group Derivation of Universality Classes}
\label{app:rg-universality}

This appendix presents a detailed renormalization group (RG) analysis deriving the four universality classes of hierarchical cooperation systems. We employ Wilson's momentum-shell RG framework to compute one-loop beta functions, identify fixed points, and extract critical exponents through systematic $\varepsilon$-expansion near the upper critical dimension.

\subsection{Effective Field Theory and Ginzburg-Landau Hamiltonian}

We begin with a coarse-grained continuum description where the order parameter field $\phi_\ell(\mathbf{x})$ at level $\ell$ represents local magnetization (coordination strength) at spatial position $\mathbf{x} \in \mathbb{R}^d$. The effective Hamiltonian generalizes the standard $\phi^4$ theory to include hierarchical coupling:
\begin{equation}
    \mathcal{H}[\{\phi_\ell\}] = \sum_{\ell=1}^{L} \int \diff^d x \left[ \frac{1}{2}(\nabla \phi_\ell)^2 + \frac{r_\ell}{2}\phi_\ell^2 + \frac{u_\ell}{4!}\phi_\ell^4 \right] + \sum_{\ell < \ell'} \int \diff^d x \, \frac{\kappa_{\ell\ell'}}{2} \phi_\ell(\mathbf{x}) \phi_{\ell'}(\mathbf{x}),
    \label{eq:hierarchical-hamiltonian-app}
\end{equation}
where $r_\ell \propto (T - T_{c,\ell})$ is the reduced temperature at level $\ell$, $u_\ell > 0$ controls intra-level fluctuations, and $\kappa_{\ell\ell'} > 0$ couples different hierarchical levels.

\subsection{Wilson Renormalization Group Framework}

We implement momentum-shell RG by dividing Fourier modes into slow ($|\mathbf{k}| < \Lambda/b$) and fast ($\Lambda/b < |\mathbf{k}| < \Lambda$) components, where $\Lambda$ is the ultraviolet cutoff and $b > 1$ is the RG scale factor. The procedure consists of three steps:
\begin{enumerate}
    \item \textbf{Integrate out fast modes}: Perform functional integration over high-momentum fluctuations $\phi_\ell^{>}(\mathbf{k})$ with $|\mathbf{k}| \in [\Lambda/b, \Lambda]$.
    \item \textbf{Rescale coordinates}: Transform $\mathbf{x}' = \mathbf{x}/b$ to restore the original cutoff.
    \item \textbf{Rescale fields}: Define $\phi_\ell'(\mathbf{x}') = \phi_\ell^{<}(\mathbf{x})/b^{(d-2+\eta)/2}$ to maintain canonical kinetic term normalization.
\end{enumerate}

Under infinitesimal RG transformation $b = e^{\ell}$ with $\ell \ll 1$, the running couplings evolve according to beta functions.

\subsection{One-Loop Beta Functions}

Perturbative calculation to one-loop order in $u$ and $\kappa$ yields the following beta functions in $d = 4 - \varepsilon$ dimensions (where $\varepsilon$ is the deviation from upper critical dimension):

\textbf{Intra-level quartic coupling:}
\begin{equation}
    \beta_u = \frac{\diff u}{\diff \ell} = -\varepsilon u + \frac{3u^2}{16\pi^2} + O(u^3, u^2\kappa).
    \label{eq:beta-u-app}
\end{equation}
The first term $-\varepsilon u$ arises from engineering dimension analysis ($[u] = \varepsilon$ in $d = 4-\varepsilon$), while the positive $u^2$ term captures fluctuation renormalization from loop diagrams.

\textbf{Inter-level coupling:}
\begin{equation}
    \beta_\kappa = \frac{\diff \kappa}{\diff \ell} = -\frac{\varepsilon}{2}\kappa + \frac{u\kappa}{8\pi^2} + O(\kappa^3, u\kappa^2).
    \label{eq:beta-kappa-app}
\end{equation}

\textbf{Reduced temperature:}
\begin{equation}
    \beta_r = \frac{\diff r}{\diff \ell} = 2r + \frac{u}{8\pi^2} + \frac{\kappa^2}{8\pi^2}.
    \label{eq:beta-r-app}
\end{equation}

\subsection{Fixed Point Analysis and Universality Classes}

Fixed points $(u^*, \kappa^*)$ satisfy $\beta_u(u^*, \kappa^*) = 0$ and $\beta_\kappa(u^*, \kappa^*) = 0$. We identify four physically distinct fixed points corresponding to the universality classes.

\subsubsection{Class I: Independent Layers ($\kappa \to 0$)}

Setting $\kappa = 0$ eliminates inter-level coupling. The system reduces to $L$ independent $\phi^4$ theories, each with Wilson-Fisher fixed point:
\begin{equation}
    u_{\text{I}}^* = \frac{16\pi^2 \varepsilon}{3}, \quad \kappa_{\text{I}}^* = 0.
    \label{eq:fp-class-i-app}
\end{equation}

\textbf{Stability matrix:} The linearized RG flow near this fixed point is governed by
\begin{equation}
    M_{\text{I}} = \begin{pmatrix}
        \varepsilon & 0 \\
        0 & -\varepsilon/2 + u_{\text{I}}^*/(8\pi^2)
    \end{pmatrix} = \begin{pmatrix}
        \varepsilon & 0 \\
        0 & \varepsilon/6
    \end{pmatrix}.
\end{equation}
Eigenvalues are $\lambda_u = \varepsilon > 0$ (relevant) and $\lambda_\kappa = \varepsilon/6 > 0$ (relevant), confirming that the independent fixed point is unstable to inter-level perturbations.

\textbf{Critical exponents:} Standard $\varepsilon$-expansion yields (to first order):
\begin{align}
    \nu_{\text{I}} &= \frac{1}{2} + \frac{\varepsilon}{12} + O(\varepsilon^2) \approx 0.630 \quad (d=3), \\
    \gamma_{\text{I}} &= 1 + \frac{\varepsilon}{6} + O(\varepsilon^2) \approx 1.237 \quad (d=3), \\
    \beta_{\text{I}} &= \frac{1}{2} - \frac{3\varepsilon}{36} + O(\varepsilon^2) \approx 0.326 \quad (d=3).
\end{align}
These match 3D Ising universality, appropriate for organizational hierarchies with siloed, independent levels.

\subsubsection{Class II: Weak Inter-Level Coupling ($\kappa \ll u$)}

When $\kappa$ is small but nonzero, we treat inter-level coupling perturbatively. To leading order in $\delta\kappa$, the cross-level correlation function acquires corrections:
\begin{equation}
    \langle \phi_\ell(\mathbf{x}) \phi_{\ell'}(\mathbf{x}') \rangle \sim \frac{1}{|\mathbf{x} - \mathbf{x}'|^{2(d-2+\eta)}} \cdot |\ell - \ell'|^{-\Delta},
\end{equation}
where the anomalous dimension is
\begin{equation}
    \Delta = 2 - \frac{\kappa}{u \xi_\perp} + O(\kappa^2).
\end{equation}

Perturbative corrections shift critical exponents as
\begin{equation}
    \nu_{\text{II}} = \nu_{\text{I}}\left(1 + c_\kappa \frac{\kappa^2}{u^2}\right), \quad \gamma_{\text{II}} = \gamma_{\text{I}}\left(1 + c'_\kappa \frac{\kappa^2}{u^2}\right).
\end{equation}

\subsubsection{Class III: Strong Inter-Level Coupling ($\kappa \sim u$)}

When inter-level coupling becomes comparable to intra-level fluctuations, the system develops a new fixed point. Solving $\beta_u = 0$ and $\beta_\kappa = 0$ simultaneously:
\begin{align}
    -\varepsilon u^* + \frac{3(u^*)^2}{16\pi^2} &= 0 \quad \Rightarrow \quad u^* = \frac{16\pi^2 \varepsilon}{3}, \\
    -\frac{\varepsilon}{2}\kappa^* + \frac{u^* \kappa^*}{8\pi^2} &= 0 \quad \Rightarrow \quad \kappa^* = \sqrt{\frac{32\pi^2 \varepsilon}{3}}.
\end{align}

This gives the strongly coupled fixed point:
\begin{equation}
    u_{\text{III}}^* = \frac{16\pi^2 \varepsilon}{3}, \quad \kappa_{\text{III}}^* = \sqrt{\frac{32\pi^2 \varepsilon}{3}}.
    \label{eq:fp-class-iii-app}
\end{equation}

\textbf{Effective dimensionality:} Strong coupling across $L$ levels increases the effective spatial dimension:
\begin{equation}
    d_{\text{eff}} = d + (L - 1).
\end{equation}
For $d_{\text{eff}} > 4$ (achieved when $L > 5-d$), the system lies above the upper critical dimension and exhibits mean-field behavior:
\begin{equation}
    \beta_{\text{III}} = \frac{1}{2}, \quad \gamma_{\text{III}} = 1, \quad \nu_{\text{III}} = \frac{1}{2}.
\end{equation}

This describes tightly integrated command-and-control hierarchies where fluctuations are suppressed by strong top-down coordination.

\subsubsection{Class IV: Hierarchical Mixed Fixed Point}

A novel fixed point emerges at intermediate coupling characterized by logarithmic corrections to scaling. Critical exponents acquire logarithmic corrections:
\begin{align}
    \beta_{\text{IV}} &= \beta_0 + \frac{c_1}{L} + O(L^{-2}), \\
    \nu_{\text{IV}} &= \nu_0\left(1 + c_2 \frac{\ln L}{L}\right), \\
    \gamma_{\text{IV}} &= \gamma_0\left(1 + c_3 \frac{\ln L}{L}\right),
\end{align}
with universal constants $\{c_1, c_2, c_3\}$ characterizing hierarchical scaling.

\subsection{Eigenvalue Analysis: Relevant and Irrelevant Operators}

Near each fixed point, we linearize the RG flow to identify relevant (growing under RG), marginal (logarithmic), and irrelevant (decaying) operators. The stability matrix is
\begin{equation}
    M_{ij} = \left.\frac{\partial \beta_i}{\partial g_j}\right|_{g^*},
\end{equation}
where $\{g_i\} = \{r, u, \kappa, \ldots\}$ are coupling constants.

\textbf{Class I (Independent):}
\begin{equation}
    M_{\text{I}} = \begin{pmatrix}
        2 & 1/(8\pi^2) & 0 \\
        0 & \varepsilon & 0 \\
        0 & 0 & \varepsilon/6
    \end{pmatrix}.
\end{equation}
Eigenvalues: $\lambda_r = 2$ (temperature, relevant), $\lambda_u = \varepsilon$ (quartic coupling, relevant), $\lambda_\kappa = \varepsilon/6$ (inter-level coupling, relevant). All positive eigenvalues confirm instability.

\textbf{Class III (Strong coupling):}
\begin{equation}
    M_{\text{III}} = \begin{pmatrix}
        2 & u^*/(8\pi^2) & \kappa^*/(4\pi^2) \\
        0 & \varepsilon & \kappa^*/(8\pi^2) \\
        0 & \kappa^*/(8\pi^2) & \varepsilon/2
    \end{pmatrix}.
\end{equation}
The thermal eigenvalue $\lambda_r = 2$ remains relevant. The coupled $u$-$\kappa$ sector has eigenvalues
\begin{equation}
    \lambda_\pm = \frac{3\varepsilon}{4} \pm \sqrt{\left(\frac{\varepsilon}{4}\right)^2 + \left(\frac{\kappa^*}{8\pi^2}\right)^2}.
\end{equation}
Both positive, confirming the fixed point attracts RG flows in the critical surface.

\subsection{Scaling Relations}

Critical exponents satisfy hyperscaling and Josephson identities:
\begin{align}
    \alpha + 2\beta + \gamma &= 2 \quad \text{(Rushbrooke)}, \\
    \gamma &= \nu(2 - \eta) \quad \text{(Fisher)}, \\
    d\nu &= 2 - \alpha \quad \text{(Josephson hyperscaling)}.
\end{align}

For hierarchical systems, we introduce a new scaling relation involving the level number $L$:
\begin{equation}
    \beta_H = \beta_0 + \frac{c_1}{L} + O(L^{-2}),
\end{equation}
verified numerically in Class IV simulations.

\subsection{Finite-Size Scaling Predictions}

In finite systems with linear size $N$ at level $\ell$, the order parameter exhibits finite-size corrections. The magnetization scales as
\begin{equation}
    m_N(T) = N^{-\beta/\nu} \tilde{m}(N^{1/\nu}(T - T_c)),
\end{equation}
where $\tilde{m}(x)$ is a universal scaling function.

At criticality ($T = T_c$), observables follow power laws:
\begin{align}
    \langle |m_\ell| \rangle_N &\sim N^{-\beta/\nu}, \\
    \chi_N &\sim N^{\gamma/\nu}, \\
    \text{Binder cumulant:} \quad U_N &= 1 - \frac{\langle m_\ell^4 \rangle}{3\langle m_\ell^2 \rangle^2} \to U^* \quad \text{(universal)}.
\end{align}

For each universality class:
\begin{itemize}
    \item \textbf{Class I}: $U^* \approx 0.465$ (3D Ising)
    \item \textbf{Class III}: $U^* = 2/3$ (mean-field)
    \item \textbf{Class IV}: $U^*(L) = U_0 + c_U/\ln L$ (logarithmic corrections)
\end{itemize}

Data collapse plots of $N^{\beta/\nu} m_N$ versus $N^{1/\nu}(T - T_c)$ provide direct validation of predicted exponents and universality class identification. The RG framework thus provides both fundamental understanding and practical diagnostic tools for hierarchical coordination systems.

\input{appendix_early_warning}

\section*{Acknowledgements}
We thank contributors to the Hierarchical Cooperation project and reviewers who stress-tested the simulation harness. We are grateful for insights from Prof. Erwin Frey's nonequilibrium field theory course at Ludwig-Maximilians-Universität München, which inspired several theoretical extensions.

